package au.com.acpfg.misc.uniprot;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;
import net.sf.ehcache.config.CacheConfiguration;
import net.sf.ehcache.store.MemoryStoreEvictionPolicy;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NoHttpResponseException;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.collection.CollectionCellFactory;
import org.knime.core.data.collection.ListCell;
import org.knime.core.data.container.DataContainer;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.JoinedRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;

import com.fatdog.xmlEngine.exceptions.CantParseDocumentException;

import au.com.acpfg.xml.reader.XMLCell;

/**
 * Responsible for retrieving the UniProt record for the user-specified accessions. Tries very hard
 * to keep getting data in the face of network disruptions, as long-running fetches should not fail
 * under normal circumstances. Also uses EHCache to cache retrieved records (if wanted by the user).
 *
 * @author andrew.cassin
 *
 */
public class RetrieveEntryTask implements UniProtTaskInterface {
    /** Shared logger for all diagnostic output of this task. */
    private static final Logger LOGGER = Logger.getAnonymousLogger();

    private final int NUM_COLUMNS = 13;                     // 14 if m_want_xml is true
    private final static int MAX_RETRIES = 5;               // give up after X attempts
    private final int MAX_UNIPROT_CACHE_ELEMENTS = 1000000; // 1 million uniprot records (max.) in cache (BUG:!?!)

    /** Without explicit timeouts a stalled connection blocks forever and no retry ever happens. */
    private static final int CONNECT_TIMEOUT_MS = 60 * 1000;
    private static final int READ_TIMEOUT_MS    = 120 * 1000;

    /** Matches UniRef identifiers such as <code>UniRef90_P12345</code>; group 2 is the bare accession. */
    private static final Pattern UNIREF_ACCSN = Pattern.compile("^UniRef(50|90|100)_(.*+)$");

    private boolean m_want_xml = false;
    private int m_fetched = 0;
    protected CacheableUniProtRecord.UniProtDatabase m_db;
    protected int m_hit;

    // object caching state (if requested)
    private int m_cache_freshness;
    private Cache m_cache;
    private CacheManager m_cache_mgr;

    /**
     * Creates a task for the database identified by <code>db</code> and, if the node model supplies
     * a cache file, sets up a disk-persistent EHCache named "uniprot". Any failure while creating
     * the cache is reported on stderr and caching is switched off for this task.
     *
     * @param m  node model providing the cache file and the cache freshness (used here as days)
     * @param db database path prefix: one starting with <code>/uniprot/</code>,
     *           <code>/uniref/</code> or <code>/uniparc/</code>; anything else maps to UNKNOWN
     */
    public RetrieveEntryTask(UniProtAccessorNodeModel m, String db) {
        assert(db != null);
        if (db.startsWith("/uniprot/")) {
            m_db = CacheableUniProtRecord.UniProtDatabase.UNIPROT_KB;
        } else if (db.startsWith("/uniref/")) {
            m_db = CacheableUniProtRecord.UniProtDatabase.UNIREF;
        } else if (db.startsWith("/uniparc/")) {
            m_db = CacheableUniProtRecord.UniProtDatabase.UNIPARC;
        } else {
            m_db = CacheableUniProtRecord.UniProtDatabase.UNKNOWN;
        }
        m_hit = 1;

        File objcache = m.getCacheFile();
        m_cache_freshness = m.getCacheFreshness();

        // try to create/use an existing cache or switch it off...
        m_cache = null;
        if (objcache != null) {
            try {
                // An eternal cache ignores its time-to-live, so the cache may only be eternal
                // when no (positive) freshness was requested. Long arithmetic avoids int overflow.
                boolean never_expire = (m_cache_freshness <= 0);
                long ttl_seconds = never_expire ? 0L : m_cache_freshness * 24L * 60L * 60L;

                m_cache_mgr = new CacheManager();
                Cache c = new Cache(new CacheConfiguration("uniprot", MAX_UNIPROT_CACHE_ELEMENTS)
                        .memoryStoreEvictionPolicy(MemoryStoreEvictionPolicy.LFU)
                        .overflowToDisk(true)
                        .eternal(never_expire)
                        .timeToLiveSeconds(ttl_seconds)
                        .diskPersistent(true)
                        .diskStorePath(objcache.getAbsolutePath())
                );
                m_cache_mgr.addCache(c);
                m_cache = m_cache_mgr.getCache("uniprot");   // ensures the cache is live...
            } catch (Exception e) {
                e.printStackTrace();
                // caching is off: release the half-initialised manager, cleanup() will not see it
                m_cache = null;
                if (m_cache_mgr != null) {
                    try {
                        m_cache_mgr.shutdown();
                    } catch (Exception ignored) {
                        // nothing more can be done, caching is already disabled
                    }
                    m_cache_mgr = null;
                }
            }
        }
    }

    /**
     * Builds the specification of the columns this task appends to the output table and records
     * whether the raw XML column is wanted (which affects later calls to <code>grok_entry()</code>).
     *
     * @param want_xml true to append a 14th column holding the XML of the record
     * @return the spec of the appended columns
     */
    public DataTableSpec getTableSpec(boolean want_xml) {
        m_want_xml = want_xml;
        int ncols = m_want_xml ? NUM_COLUMNS + 1 : NUM_COLUMNS;

        DataColumnSpec[] appended_col_specs = new DataColumnSpec[ncols];
        DataType dt = ListCell.getCollectionType(StringCell.TYPE);
        appended_col_specs[0]  = make_column("UniProt ID", StringCell.TYPE);
        appended_col_specs[1]  = make_column("UniProt Recommended Protein Name", StringCell.TYPE);
        appended_col_specs[2]  = make_column("UniProt X-Refs", dt);
        appended_col_specs[3]  = make_column("UniProt Sequence", StringCell.TYPE);
        appended_col_specs[4]  = make_column("UniProt Gene (primary)", StringCell.TYPE);
        appended_col_specs[5]  = make_column("UniProt Organism", StringCell.TYPE);
        appended_col_specs[6]  = make_column("UniProt Comments", dt);
        appended_col_specs[7]  = make_column("UniProt Organelles", dt);
        appended_col_specs[8]  = make_column("UniProt Taxon", dt);
        appended_col_specs[9]  = make_column("UniProt Keywords", dt);
        appended_col_specs[10] = make_column("UniProt Protein Existence Evidence", StringCell.TYPE);
        appended_col_specs[11] = make_column("UniProt Features", dt);
        appended_col_specs[12] = make_column("UniProt Citations", dt);
        if (m_want_xml) {
            appended_col_specs[NUM_COLUMNS] = make_column("XML", XMLCell.TYPE);
        }
        return new DataTableSpec(appended_col_specs);
    }

    /** Creates a single column spec with the given name and type. */
    private static DataColumnSpec make_column(String name, DataType type) {
        return new DataColumnSpecCreator(name, type).createSpec();
    }

    /**
     * Retrieves the record for each accession (from the cache when possible, otherwise over HTTP)
     * and adds one output row per accession for which cells were obtained.
     *
     * @param accsns  accessions to retrieve
     * @param in_rows input rows, indexed in parallel with <code>accsns</code>; when non-null each
     *                result is joined to the input row at the same index. May be null, in which
     *                case rows are keyed "Hit1", "Hit2", ...
     * @param out     container that receives the output rows
     * @return the number of rows added to <code>out</code>
     * @throws Exception if the database is UNKNOWN, the task is interrupted while waiting, or
     *                   a record cannot be processed
     */
    @Override
    public int run(String[] accsns, DataRow[] in_rows, DataContainer out) throws Exception {
        assert(accsns != null && out != null);

        if (m_db == CacheableUniProtRecord.UniProtDatabase.UNKNOWN) {
            throw new Exception("Unsupported operation: "+m_db);
        }

        // one of UNIPROT_KB, UNIPARC or UNIREF
        int n_hits = 0;
        m_fetched = 0;     // how many records in batch needed to be fetched?

        // idx addresses accsns and in_rows together, so a row which yields no output (deleted,
        // unknown or unreachable entry) can never shift later results onto the wrong input row
        for (int idx = 0; idx < accsns.length; idx++) {
            final String accsn = accsns[idx];
            DataCell[] cells = null;
            boolean get_via_www = true;

            // 1. try to find a suitable object in the cache and use it where possible...
            if (m_cache != null) {
                String key = CacheableUniProtRecord.makeKey(m_db, accsn);
                Element e = m_cache.get(key);
                if (e != null) {
                    Object v = e.getObjectValue();   // NB: this is not serialisable
                    assert(v != null);
                    cells = grok_entry(((CacheableUniProtRecord) v).getXML());
                    get_via_www = false;
                }
                // else FALLTHRU...
            }

            // 2. else not in the cache so fetch it from uniprot...
            if (get_via_www) {
                m_fetched++;
                cells = fetch_via_www(accsn);
                Thread.sleep(3 * 1000);   // be nice to uniprot servers between accsns
            }

            // output current row (if any)
            if (cells == null) {
                continue;
            }
            if (in_rows != null) {
                DataRow r = new DefaultRow(in_rows[idx].getKey(), cells);   // NB: cells must match tablespec
                out.addRowToTable(new JoinedRow(in_rows[idx], r));
            } else {
                out.addRowToTable(new DefaultRow("Hit"+m_hit++, cells));
            }
            n_hits++;
        }
        return n_hits;
    }

    /**
     * Downloads and processes the XML record for a single accession, making up to
     * <code>MAX_RETRIES</code> attempts.
     *
     * @param accsn the accession to fetch
     * @return the cells for the record, or null if no data was obtained (entry deleted, entry not
     *         found, or all attempts failed)
     * @throws Exception if processing the record fails or the thread is interrupted while waiting
     */
    private DataCell[] fetch_via_www(String accsn) throws Exception {
        String db_str = "uniprot";
        if (m_db == CacheableUniProtRecord.UniProtDatabase.UNIREF) {
            db_str = "uniref";
        } else if (m_db == CacheableUniProtRecord.UniProtDatabase.UNIPARC) {
            db_str = "uniparc";
        }

        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            URL url = new URL("http://www.uniprot.org/"+db_str+"/"+accsn+".xml");
            LOGGER.info(url.toString());

            int delay = 0;          // seconds to wait before the next attempt
            InputStream is = null;
            try {
                URLConnection conn = url.openConnection();
                conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
                conn.setReadTimeout(READ_TIMEOUT_MS);
                is = conn.getInputStream();
                return grok_entry(m_cache, accsn, is);
            } catch (DeletedEntryException de) {
                LOGGER.warning("Entry "+accsn+" appears to be deleted. Ignoring.");
                return null;        // no output for this row
            } catch (SocketTimeoutException ste) {
                ste.printStackTrace();
                if (attempt < MAX_RETRIES) {
                    // back off for longer after each timeout; no point waiting if we are giving up
                    delay = attempt * 100;
                    LOGGER.warning("Entry "+accsn+" timed out getting XML (network problem?). Retrying in "+delay+" seconds");
                }
            } catch (FileNotFoundException fe) {
                // maybe a bad accsn and we tried to fetch it?
                LOGGER.warning("Entry "+accsn+" can not be found. Are you sure the accession column is configured?");
                return null;        // no output for this row
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // always release the connection's stream, even when processing the record failed
                if (is != null) {
                    try {
                        is.close();
                    } catch (IOException ignored) {
                        // the outcome of this attempt is already decided
                    }
                }
            }

            if (delay > 0) {
                Thread.sleep(delay * 1000L);
            }
        }

        LOGGER.warning("Entry "+accsn+" could not be retrieved after "+MAX_RETRIES+" attempts. Ignoring.");
        return null;
    }

    /**
     * Converts a list of strings into a list cell.
     *
     * @param data the strings to wrap, may be null
     * @return a list cell of string cells, or a missing cell if <code>data</code> is null or empty
     */
    protected DataCell list2listcell(List<String> data) {
        if (data == null || data.size() < 1) {
            return DataType.getMissingCell();
        }
        ArrayList<StringCell> coll = new ArrayList<StringCell>(data.size());
        for (String s : data) {
            coll.add(new StringCell(s));
        }
        return CollectionCellFactory.createListCell(coll);
    }

    /**
     * Wraps a string in a cell.
     *
     * @param datum the value, may be null
     * @return a string cell, or a missing cell if <code>datum</code> is null
     */
    protected DataCell safe_string(String datum) {
        if (datum == null) {
            return DataType.getMissingCell();
        }
        return new StringCell(datum);
    }

    /**
     * Trims the accession and, for identifiers of the form <code>UniRef50_X</code>,
     * <code>UniRef90_X</code> or <code>UniRef100_X</code>, strips the UniRef database prefix.
     *
     * @param in_accsn the accession as supplied by the user (must not be null)
     * @return the trimmed accession, without any UniRef database prefix
     * @throws Exception not thrown by this implementation; declared for compatibility
     */
    public String fix_accsn(String in_accsn) throws Exception {
        String tmp = in_accsn.trim();
        if (tmp.startsWith("UniRef")) {
            Matcher m = UNIREF_ACCSN.matcher(tmp);
            if (m.matches()) {
                tmp = m.group(2);   // only want the accsn, not the uniref db identifier
            }
        }
        return tmp;
    }

    /**
     * Template function to turn the stream into a string for subsequent processing. XML is limited
     * in size (by the record sizes) from UniProt so this shouldn't be too wasteful of memory. If a
     * cache is supplied and the XML is non-empty, the XML is also stored in that cache to speed
     * subsequent access (eg. if more uniprot work is done later in the KNIME workflow).
     *
     * @param cache           the cache to store the record into, or null to skip caching
     * @param accsn           accession the record was requested for (used to build the cache entry)
     * @param response_stream stream to read the XML record from; not closed by this method
     * @return the cells for the record, see {@link #grok_entry(String)}
     * @throws Exception if the stream cannot be converted or the record cannot be processed
     */
    protected final DataCell[] grok_entry(Cache cache, String accsn, InputStream response_stream) throws Exception {
        String fixed_xml = UniProtHit.xml2string(response_stream, true);

        if (cache != null && fixed_xml != null && fixed_xml.length() > 0) {
            CacheableUniProtRecord rec = new CacheableUniProtRecord(m_db, accsn, fixed_xml);
            cache.put(new Element(rec.getKey(), rec), false);
        }

        return grok_entry(fixed_xml);
    }

    /**
     * Override this method to implement a custom response to the retrieved record. Note that this
     * method <b>DOES NOT</b> cache the record, use the other form of the method for that.
     *
     * @param xml a single, complete, XML record from UniProtKB/UniPARC/UniRef (should be well-formed
     *            XML or an exception is the likely result)
     * @return the cells representing the results for the current record (must match the columns
     *         specified by <code>getTableSpec()</code>); all cells are missing if the XML could not
     *         be parsed or contained no entries
     * @throws Exception any failure (other than a parse failure) raised while creating the entries,
     *                   including <code>DeletedEntryException</code>
     */
    protected DataCell[] grok_entry(String xml) throws Exception {
        List<UniProtHit> hits = null;
        try {
            hits = UniProtHit.make_entries(xml);
        } catch (CantParseDocumentException cpde) {
            // BUG: this exception tends to be thrown when < occurs in an id attribute from a
            // UniProt entry. I dont think the XML is invalid, but the UniProtHit parsing code
            // doesnt handle it for now...
            LOGGER.warning("Cannot parse XML: ignoring data (bug)!");
            // fallthru: return missing
            hits = null;
        } catch (DeletedEntryException dee) {
            // dont do a stack trace here...
            throw dee;
        } catch (Exception e) {
            e.printStackTrace();
            throw e;
        }

        int ncols = m_want_xml ? NUM_COLUMNS + 1 : NUM_COLUMNS;
        DataCell[] cells = new DataCell[ncols];
        for (int i = 0; i < cells.length; i++) {
            cells[i] = DataType.getMissingCell();
        }

        if (hits == null || hits.size() < 1) {
            return cells;
        } else if (hits.size() > 1) {
            LOGGER.warning("Multiple choices available: entry looks to have been replaced, only first choice is reported.");
        }

        UniProtHit hit = hits.get(0);
        cells[0]  = safe_string(hit.getID());
        cells[1]  = safe_string(hit.getRecommendedName());
        cells[2]  = list2listcell(hit.getXrefs());
        cells[3]  = safe_string(hit.getSequence());
        cells[4]  = safe_string(hit.getGenePrimary());
        cells[5]  = safe_string(hit.getOrganism());
        cells[6]  = list2listcell(hit.getComments());
        cells[7]  = DataType.getMissingCell();   // organelles column is always reported as missing
        cells[8]  = list2listcell(hit.getLineage());
        cells[9]  = list2listcell(hit.getKeywords());
        cells[10] = safe_string(hit.getExistenceEvidence());
        cells[11] = list2listcell(hit.getFeatures());
        cells[12] = list2listcell(hit.getCitations());
        if (m_want_xml) {
            cells[NUM_COLUMNS] = new XMLCell(xml);
        }
        return cells;
    }

    /**
     * Flushes the cache (if one is in use) and shuts down its manager. The manager is shut down
     * even if the flush fails.
     *
     * @throws Exception if flushing or shutting down the cache fails
     */
    @Override
    public void cleanup() throws Exception {
        if (m_cache != null) {
            try {
                m_cache.flush();
            } finally {
                m_cache_mgr.shutdown();
            }
        }
    }

    /**
     * Checks for cancellation and reports progress. Once 20 or more records have been fetched over
     * the network since the counter was last reset, waits 20 seconds and resets the counter.
     *
     * @param exec     execution context used for the cancellation check and progress reporting
     * @param progress progress value to report
     * @param msg      not used by this implementation
     * @throws InterruptedException       if interrupted while waiting
     * @throws CanceledExecutionException if the user cancelled execution
     */
    @Override
    public void pause(ExecutionContext exec, double progress, String msg) throws InterruptedException, CanceledExecutionException {
        exec.checkCanceled();
        if (m_fetched >= 20) {
            exec.setProgress(progress, "Pause to be nice to UniProt servers (20sec. delay)");
            Thread.sleep(20 * 1000);
            m_fetched = 0;
        } else {
            exec.setProgress(progress);
        }
    }
}